Install and load the required packages and libraries.

library(ggplot2)
library(ezids)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)

Data Shaping

Read Data

Read the data .CSV files of adults arrest in DC area from 2016-2021

# Read the yearly CSV extracts of adult arrests in the DC area (2016-2021).
# read.csv() already returns a data.frame, so the original data.frame()
# wrapper around each call was redundant and has been removed.
df_2016 <- read.csv("Arrests 2016 Public.csv")
df_2017 <- read.csv("Arrests 2017 Public.csv")
df_2018 <- read.csv("Arrests by Year, 2018.csv")
df_2019 <- read.csv("Arrests by Year, 2019.csv")
df_2020 <- read.csv("Arrests by Year 2020.csv")
df_2021 <- read.csv("2021 Adult Arrests.csv")

# Column names of the 2016 and 2018 files, used below to compare the two
# layouts (colnames() already returns a character vector; c() was redundant).
c16 <- colnames(df_2016)
c18 <- colnames(df_2018)

Note that the column names of df_2016 and df_2017 are not the same as those of the other data frames. We show the column names of df_2016 and df_2018 below:

col # 2016 2018
1 Arrestee.Type Arrestee.Type
2 Arrest.Year Arrest.Year
3 Arrest.Date Arrest.Date
4 Arrest.Hour Arrest.Hour
5 CCN CCN
6 Arrest.Number. Arrest.Number.
7 Age Age
8 Defendant.PSA Defendant.PSA
9 Defendant.District Defendant.District
10 Defendant.Race Defendant.Race
11 Defendant.Ethnicity Defendant.Ethnicity
12 Defendant.Sex Defendant.Sex
13 Arrest.Category Arrest.Category
14 Charge.Description Charge.Description
15 Arrest.Location.PSA Arrest.Location.PSA
16 Arrest.Location.District Arrest.Location.District
17 Arrest.Location.Block.GeoX Arrest.Block.GEOX
18 Arrest.Location.Block.GeoY Arrest.Block.GEOY
19 Offense.GEOY Arrest.Latitude
20 Offense.GEOX Arrest.Longitude
21 Offense.PSA Offense.Location.PSA
22 Offense.District Offense.Location.District
23 Arrest.Latitude Offense.Block.GEOX
24 Arrest.Longitude Offense.Block.GEOY
25 Offense.Latitude Offense.Latitude
26 Offense.Longitude Offense.Longitude

Merge Multiple CSV Files

You can see that the column names are the same from the first column to the 14th column in both data frames. On the other hand, the names and order of the 15th and later columns differ slightly between them. The later columns are about locations, and we are not very interested in the detailed location. Therefore, we will delete the later columns except for the 16th and 22nd columns. In addition, we will drop CCN (col #5) and Arrest.Number. (col #6) because they are IDs and useless for our analysis.

After deleting some columns, we will bind data frames by rows.

# Column positions to keep: everything except the ID columns (5: CCN,
# 6: Arrest.Number.) and the detailed location columns, retaining only the
# two *.District location columns (positions 16 and 22 in both layouts).
keep_cols <- c(1:4, 7:14, 16, 22)

# Stack 2016-2017 (old column layout) and subset the shared columns.
df_16_17 <- rbind(df_2016, df_2017)[, keep_cols]
# Align the two district column names with the 2018+ naming.
names(df_16_17)[c(13, 14)] <- c('Arrest.Location.District', 'Offense.Location.District')

# Stack 2018-2021 (new column layout) and subset the same positions.
df_18_21 <- rbind(df_2018, df_2019, df_2020, df_2021)[, keep_cols]

# Combine all six years into one data frame and show the column names.
DF <- rbind(df_16_17, df_18_21)
colnames(DF)
##  [1] "Arrestee.Type"             "Arrest.Year"              
##  [3] "Arrest.Date"               "Arrest.Hour"              
##  [5] "Age"                       "Defendant.PSA"            
##  [7] "Defendant.District"        "Defendant.Race"           
##  [9] "Defendant.Ethnicity"       "Defendant.Sex"            
## [11] "Arrest.Category"           "Charge.Description"       
## [13] "Arrest.Location.District"  "Offense.Location.District"
str(DF)
## 'data.frame':    152386 obs. of  14 variables:
##  $ Arrestee.Type            : chr  "Adult Arrest" "Adult Arrest" "Adult Arrest" "Adult Arrest" ...
##  $ Arrest.Year              : int  2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
##  $ Arrest.Date              : chr  "2016-01-01" "2016-01-01" "2016-01-01" "2016-01-01" ...
##  $ Arrest.Hour              : int  0 0 0 0 0 0 1 1 1 1 ...
##  $ Age                      : int  39 27 21 52 33 22 26 27 23 34 ...
##  $ Defendant.PSA            : chr  "Out of State" "Out of State" "Out of State" "603" ...
##  $ Defendant.District       : chr  "Out of State" "Out of State" "Out of State" "6D" ...
##  $ Defendant.Race           : chr  "WHITE" "WHITE" "BLACK" "BLACK" ...
##  $ Defendant.Ethnicity      : chr  "UNKNOWN" "NOT HISPANIC" "NOT HISPANIC" "NOT HISPANIC" ...
##  $ Defendant.Sex            : chr  "MALE" "MALE" "MALE" "MALE" ...
##  $ Arrest.Category          : chr  "Simple Assault" "Simple Assault" "Assault on a Police Officer" "Traffic Violations" ...
##  $ Charge.Description       : chr  "Threats To Do Bodily Harm -misd" "Simple Assault" "Assault On A Police Officer (simple Assault)" "Permit Suspended-oas" ...
##  $ Arrest.Location.District : chr  "2D" "3D" "3D" "6D" ...
##  $ Offense.Location.District: chr  "2D" "3D" "3D" "6D" ...
# Summary-statistics table of the merged arrests data
# (xkablesummary comes from the ezids package loaded above).
xkablesummary(DF)
Table: Statistics summary.
Arrestee.Type Arrest.Year Arrest.Date Arrest.Hour Age Defendant.PSA Defendant.District Defendant.Race Defendant.Ethnicity Defendant.Sex Arrest.Category Charge.Description Arrest.Location.District Offense.Location.District
Min Length:152386 Min. :2016 Length:152386 Min. : 0.00 Min. : 18.00 Length:152386 Length:152386 Length:152386 Length:152386 Length:152386 Length:152386 Length:152386 Length:152386 Length:152386
Q1 Class :character 1st Qu.:2017 Class :character 1st Qu.: 6.00 1st Qu.: 25.00 Class :character Class :character Class :character Class :character Class :character Class :character Class :character Class :character Class :character
Median Mode :character Median :2018 Mode :character Median :12.00 Median : 32.00 Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character
Mean NA Mean :2018 NA Mean :11.81 Mean : 35.19 NA NA NA NA NA NA NA NA NA
Q3 NA 3rd Qu.:2019 NA 3rd Qu.:18.00 3rd Qu.: 43.00 NA NA NA NA NA NA NA NA NA
Max NA Max. :2021 NA Max. :23.00 Max. :121.00 NA NA NA NA NA NA NA NA NA

Drop some rows

Since we are interested in crimes committed by white males, we will drop rows where the value of Defendant.Race is not “WHITE” or the value of Defendant.Sex is not “MALE”.

In addition, Arrest.Category has some different values for 2021 and other years:

  • Data in 2021 has “Release Violations/Fugitive (Fug)” and “Release Violations/Fugitive (Warr)” although data in other years have “Release Violations/Fugitive” instead of them.
  • Data in 2021 has “Fraud and Financial Crimes (Frau)” although data in other years have “Fraud and Financial Crimes”.

Therefore, we will convert these values in 2021 into the corresponding values used in the other years.

# Restrict to white male arrestees.
DF_WM <- subset(DF, subset = Defendant.Race=='WHITE' & Defendant.Sex=='MALE')

# 2021 uses suffixed labels ("... (Fug)", "... (Warr)", "... (Frau)") for two
# categories; collapse them to the plain labels used in 2016-2020.
# A single alternation + backreference replaces the two separate gsub() passes.
DF_WM <- mutate(
  DF_WM,
  Arrest.Category = gsub(
    pattern = "(Release Violations/Fugitive|Fraud and Financial Crimes).*",
    replacement = "\\1",
    x = Arrest.Category
  )
)
str(DF_WM)
## 'data.frame':    12283 obs. of  14 variables:
##  $ Arrestee.Type            : chr  "Adult Arrest" "Adult Arrest" "Adult Arrest" "Adult Arrest" ...
##  $ Arrest.Year              : int  2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
##  $ Arrest.Date              : chr  "2016-01-01" "2016-01-01" "2016-01-01" "2016-01-01" ...
##  $ Arrest.Hour              : int  0 0 1 1 13 2 3 3 6 7 ...
##  $ Age                      : int  39 27 27 26 48 25 21 41 29 22 ...
##  $ Defendant.PSA            : chr  "Out of State" "Out of State" "Out of State" "Out of State" ...
##  $ Defendant.District       : chr  "Out of State" "Out of State" "Out of State" "Out of State" ...
##  $ Defendant.Race           : chr  "WHITE" "WHITE" "WHITE" "WHITE" ...
##  $ Defendant.Ethnicity      : chr  "UNKNOWN" "NOT HISPANIC" "HISPANIC" "NOT HISPANIC" ...
##  $ Defendant.Sex            : chr  "MALE" "MALE" "MALE" "MALE" ...
##  $ Arrest.Category          : chr  "Simple Assault" "Simple Assault" "Driving/Boating While Intoxicated" "Simple Assault" ...
##  $ Charge.Description       : chr  "Threats To Do Bodily Harm -misd" "Simple Assault" "Driving While Intoxicated -2nd Off" "Simple Assault" ...
##  $ Arrest.Location.District : chr  "2D" "3D" "4D" "5D" ...
##  $ Offense.Location.District: chr  "2D" "3D" "4D" "5D" ...
# Summary-statistics table of the white-male subset
# (xkablesummary comes from the ezids package loaded above).
xkablesummary(DF_WM)
Table: Statistics summary.
Arrestee.Type Arrest.Year Arrest.Date Arrest.Hour Age Defendant.PSA Defendant.District Defendant.Race Defendant.Ethnicity Defendant.Sex Arrest.Category Charge.Description Arrest.Location.District Offense.Location.District
Min Length:12283 Min. :2016 Length:12283 Min. : 0.00 Min. : 18.00 Length:12283 Length:12283 Length:12283 Length:12283 Length:12283 Length:12283 Length:12283 Length:12283 Length:12283
Q1 Class :character 1st Qu.:2017 Class :character 1st Qu.: 4.00 1st Qu.: 25.00 Class :character Class :character Class :character Class :character Class :character Class :character Class :character Class :character Class :character
Median Mode :character Median :2018 Mode :character Median :11.00 Median : 32.00 Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character
Mean NA Mean :2018 NA Mean :11.17 Mean : 34.58 NA NA NA NA NA NA NA NA NA
Q3 NA 3rd Qu.:2019 NA 3rd Qu.:18.00 3rd Qu.: 42.00 NA NA NA NA NA NA NA NA NA
Max NA Max. :2021 NA Max. :23.00 Max. :121.00 NA NA NA NA NA NA NA NA NA

EDA

The number of crimes is as follows. Crime occurrences seem to have decreased after COVID-19.

year the number of crimes
2016 2620
2017 2636
2018 2298
2019 2193
2020 1426
2021 1110
2016 - 2019 (before COVID-19) 2437
2020 - 2021 (after COVID-19) 1268

Including Plots

Bar plot

We will create some bar plots to see the number of occurrences per type of crime.
The Bar plot of crimes in 2016 - 2021 is as follows:

# Figure 1: frequency of each crime category over 2016-2021,
# bars ordered from most to least common (fct_infreq).
ggplot(DF_WM, aes(forcats::fct_infreq(Arrest.Category))) +
  geom_bar() +
  labs(title = "Figure 1: Bar plot of crimes in 2016 - 2021", x = "crime types") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

The Bar plots of crimes before COVID-19 (2016 - 2019) and after COVID-19 (2020 - 2021) are as follows:

# Figure 2: category frequencies before COVID-19 (2016-2019), most common first.
# Fixed typo in the title string: "befor" -> "before".
ggplot(subset(DF_WM,Arrest.Year <= 2019), aes(forcats::fct_infreq(Arrest.Category))) +
  ggtitle("Figure 2: Bar plot of crimes before COVID-19") + xlab("crime types") + geom_bar() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

# Figure 3: category frequencies after COVID-19 (2020-2021), most common first.
ggplot(subset(DF_WM,Arrest.Year > 2019), aes(forcats::fct_infreq(Arrest.Category))) +
  ggtitle("Figure 3: Bar plot of crimes after COVID-19") + xlab("crime types") + geom_bar() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))

The Bar plots of crimes in each year are as follows:

# Figures 4-9: one bar plot of crime-category frequencies per year
# (Figure 4 is 2016, ..., Figure 9 is 2021), bars ordered most common first.
# The six near-identical copy-pasted chunks are collapsed into a loop;
# print() is required because ggplot objects do not auto-print inside a loop.
for (yr in 2016:2021) {
  fig_no <- 4 + (yr - 2016)  # figure number tracks the year
  p <- ggplot(subset(DF_WM, Arrest.Year == yr),
              aes(forcats::fct_infreq(Arrest.Category))) +
    ggtitle(sprintf("Figure %d: Bar plot of crimes in %d", fig_no, yr)) +
    xlab("crime types") + geom_bar() +
    theme(axis.text.x = element_text(angle = 60, hjust = 1))
  print(p)
}

“Offenses Against Family & Children” has been increasing after COVID-19.

Line plots

In order to check changes in crime trends, the graph below illustrates, for each crime type, the change over time in the percentage of crimes of that crime type in all crimes.

# Contingency table: number of arrests per (year, crime category).
table_freq <- xtabs(~Arrest.Year+Arrest.Category,DF_WM)

# Total number of crimes in each year: one row sum per year.
# (Replaces the six hand-written sum(table_freq[i, ]) assignments; rowSums
# yields the same values in the same row order.)
sum_lst <- rowSums(table_freq)

# Share of each category within its year: each row of table_freq is divided
# by that year's total (sum_lst has one entry per row, so recycling divides
# row i by sum_lst[i]).
table_rfreq <- table_freq/sum_lst

# Convert the table into a long-format data frame
# (columns: Arrest.Year, Arrest.Category, Freq) for ggplot.
DF_rfreq <- data.frame(table_rfreq)

# xtabs stores the year as a factor; convert to numeric for a continuous axis.
DF_rfreq$Arrest.Year <- as.character(DF_rfreq$Arrest.Year) %>% as.numeric()

# Figure 10: yearly share of every crime category, one line per category.
ggplot(DF_rfreq, aes(x=Arrest.Year, y=Freq, color=Arrest.Category, group=Arrest.Category)) +
  ggtitle("Figure 10: Line plot of percentage of a crime vs year") + xlab("Year") + 
  ylab("Percentage of a crime relative to all crimes") +
  geom_line() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1), 
        legend.text = element_text(size=5),
        legend.key.size = unit(0.2, 'cm')) 

#install.packages("ggrepel") 
library(ggrepel)

# list of Arrest.Category sorted by descending order
# Crime categories ordered by their 2021 share, largest first.
# (tmp is reused by the next chunk to select the non-top-10 categories.)
tmp <- subset(DF_rfreq, Arrest.Year == 2021)
tmp <- as.character(tmp$Arrest.Category[order(tmp$Freq, decreasing = TRUE)])

# Keep only the ten categories that were most common in 2021.
DF_rfreq_10 <- subset(DF_rfreq, Arrest.Category %in% head(tmp, 10))

# Figure 11: yearly share of each top-10 category, with each line labelled
# at its 2021 endpoint (ggrepel keeps the labels from overlapping).
ggplot(DF_rfreq_10, aes(x = Arrest.Year, y = Freq,
                        color = Arrest.Category, group = Arrest.Category)) +
  geom_line() +
  labs(title = "Figure 11: Line plot of percentage of a crime vs year for top 10 crimes in 2021",
       x = "Year",
       y = "Percentage of a crime relative to all crimes") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1), legend.position = "none") +
  geom_text_repel(
    data = subset(DF_rfreq_10, Arrest.Year == 2021),
    aes(label = Arrest.Category),
    nudge_x = 1,
    nudge_y = 0.08,
    segment.alpha = 0.3,
    size = 2
  ) +
  lims(x = c(2016, 2023))

# Crime categories outside the 2021 top 10 (tmp is sorted by 2021 share,
# descending, in the previous chunk).
DF_rfreq_non10 <- subset(DF_rfreq, Arrest.Category %in% tmp[-c(1:10)])

# Figure 12: yearly share of the remaining (less common) categories on a
# log y-scale, labelled at the 2021 endpoint.
ggplot(DF_rfreq_non10, aes(x=Arrest.Year, y=Freq, color=Arrest.Category, group=Arrest.Category)) +
  ggtitle("Figure 12: Line plot of percentage of a crime vs year for other crimes in Fig.11") + xlab("Year") + 
  ylab("Percentage of a crime relative to all crimes") +
  geom_line() +
  theme(axis.text.x = element_text(angle = 60, hjust = 1), legend.position = "none") +
  # Log scale: category/year cells with zero share become -Inf, which is what
  # triggers the "infinite values" warnings shown below the chunk.
  scale_y_log10() +
  geom_text_repel(
    data = subset(DF_rfreq_non10, Arrest.Year == 2021),
    aes(label = Arrest.Category),
    # NOTE(review): nudge_x = 30 is far outside the 2016-2022 x-range set by
    # lims() below; ggrepel pulls labels back into the panel, but confirm this
    # large offset is intended (Figure 11 uses nudge_x = 1).
    nudge_x = 30,
    nudge_y = 0.1,
    segment.alpha = 0.3,
    size = 2
  ) +
  lims(x = c(2016, 2022))
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis

“Simple Assault” and “Offenses Against Family & Children” seem to be increasing after COVID-19. On the other hand, “Traffic Violations” seems to be decreasing after COVID-19.

Analysis

Since crime is likely to be a rare event, the number of occurrences per day of a given crime is expected to follow Poisson distribution. Poisson distribution is a distribution used to describe the distribution of the number of rare phenomena when a large number of them are observed. If a distribution follows Poisson distribution, and the average number of occurrences of the phenomenon is \(\lambda\), the probability that the phenomenon will occur \(x\) times is given by \[p(x) = \exp(-\lambda)\frac{\lambda^{x}}{x!}.\] In the following, we will estimate \(\lambda\) of each crime before and after COVID-19 to see there is a difference in crime trend.

Offenses Against Family & Children

Before COVID-19

The trend of “Offenses Against Family & Children,” Domestic Violence (DV), appears to have changed after COVID-19. The frequency table of DV before COVID-19 is as follows.

# Pre-COVID subset (2016-2019) of the white-male arrests.
DF_WM_16_19 <- DF_WM[DF_WM$Arrest.Year %in% 2016:2019, ]
# Domestic-violence (DV) arrests only.
DF_WM_16_19_DV <- DF_WM_16_19[DF_WM_16_19$Arrest.Category=='Offenses Against Family & Children', ]

# Number of DV arrests on each distinct arrest date (dates with zero DV
# arrests do not appear in this vector).
DV_day_16_19 <- vapply(unique(DF_WM_16_19_DV$Arrest.Date),
                       function(d) sum(DF_WM_16_19_DV$Arrest.Date == d),
                       integer(1))
# of occurrences per day Frequency Relative frequency
0 1362 0.9322382
1 95 0.065024
2 2 0.0013689
3 0 0

We can calculate \(\lambda\) from the above table, obtaining \(\lambda = 0.0678\). We will plot the histogram and the Poisson distribution with \(\lambda = 0.0678\) to check whether they match.

# Support values 0-5 occurrences/day and observed day counts for 2016-2019
# (365*4+1 = 1461 days; 2016 is a leap year; 99 dates had >= 1 DV arrest).
# NOTE(review): these counts (98 one-DV days, 1 two-DV day) do not match the
# frequency table shown above (95 and 2) — confirm which is correct.
x_DV <- 0:5
y_DV <- c(365*4+1-99,98,1,0,0,0)
# Poisson pmf with lambda = 99 DV arrests / 1461 days ~ 0.0678.
fx <- dpois(x=x_DV, lambda=99/(365*4+1))
data_DV <- data.frame(x_DV, y_DV, fx)

# Figure 13: raw frequency histogram of daily DV counts.
ggplot(data_DV, aes(x=x_DV,y=y_DV)) +
  ggtitle("Figure 13: Histogram of DV in 2016 - 2019") + 
  xlab("Number of occurrences per day") + 
  ylab("Frequency") +
  geom_bar(stat = "identity") 

# Figure 14: relative-frequency histogram with the fitted Poisson pmf overlaid.
ggplot(data_DV) +
  ggtitle("Figure 14: Relative frequency histogram of DV in 2016 - 2019 \n and Poisson distribution with lambda = 0.0678") + 
  xlab("Number of occurrences per day") + 
  ylab("Relative frequency") +
  geom_bar(aes(x=x_DV,y=y_DV/(365*4+1)), stat = "identity") +
  geom_line(aes(x=x_DV,y=fx), color='red')

We can see that the Poisson distribution fits well with the histogram.

Next, we try to estimate \(99\%\) Confidence Interval of \(\lambda\). The variance of Poisson distribution is equal to its mean (\(\lambda\)). Therefore, \(99\%\) Confidence Interval of \(\lambda\) can be written as \[ \bar{x} - z_{*}\cdot\sqrt{\frac{\bar{x}}{n}} \leq \lambda \leq \bar{x} + z_{*}\cdot\sqrt{\frac{\bar{x}}{n}}, \] where \(\bar{x}\) is the sample mean, \(n\) is the sample size, and \(z_*\) is z-value corresponding to the \(99\%\) confidence interval, and the value is 2.58. From this expression, 99% Confidence Interval of \(\lambda\) for DV before COVID-19 is [0.05, 0.0856].

After COVID-19

The frequency table of DV after COVID-19 is as follows.

# Post-COVID subset (2020-2021) of the white-male arrests.
DF_WM_20_21 <- DF_WM[DF_WM$Arrest.Year %in% 2020:2021, ]
# Domestic-violence (DV) arrests only.
DF_WM_20_21_DV <- DF_WM_20_21[DF_WM_20_21$Arrest.Category=='Offenses Against Family & Children', ]

# Number of DV arrests on each distinct arrest date (dates with zero DV
# arrests do not appear in this vector).
DV_day_20_21 <- vapply(unique(DF_WM_20_21_DV$Arrest.Date),
                       function(d) sum(DF_WM_20_21_DV$Arrest.Date == d),
                       integer(1))
# of occurrences per day Frequency Relative frequency
0 555 0.7592339
1 47 0.0642955
2 1 0.001368
3 0 0
4 1 0.001368
5 0 0
0 0
44 0 0
45 1 0.001368
46 0 0
0 0
77 0 0
78 1 0.001368
79 0 0
0 0

There are two outliers (45 and 78) in the table. Their dates are 2021-01-06 and 2020-06-01. Since these dates correspond to the “Capitol attack” and the “George Floyd protests”, we will drop the values for these dates.

The calculated \(\lambda = 0.0725\). The histogram and the poisson distribution with \(\lambda = 0.0725\) are shown in Figure 16.

# Support values 0-5 occurrences/day and observed day counts for 2020-2021
# (365*2+1 = 731 days; 2020 is a leap year; 49 dates had >= 1 DV arrest
# after dropping the two outlier dates).
# NOTE(review): the counts (47 one-DV days, 1 two-DV day, 1 four-DV day)
# should be re-checked against the frequency table shown above.
x_DV <- 0:5
y_DV <- c(365*2+1-49,47,1,0,1,0)
# Poisson pmf with lambda = 53 DV arrests / 731 days ~ 0.0725.
fx <- dpois(x=x_DV, lambda=53/(365*2+1))
data_DV <- data.frame(x_DV, y_DV, fx)

# Figure 15: raw frequency histogram of daily DV counts.
ggplot(data_DV, aes(x=x_DV,y=y_DV)) +
  ggtitle("Figure 15: Histogram of DV in 2020 - 2021") + 
  xlab("Number of occurrences per day") + 
  ylab("Frequency") +
  geom_bar(stat = "identity") 

# Figure 16: relative-frequency histogram with the fitted Poisson pmf overlaid.
# Fixed typos in the title: "Reralive" -> "Relative", "Pission" -> "Poisson".
ggplot(data_DV) +
  ggtitle("Figure 16: Relative frequency histogram of DV in 2020 - 2021 \n and Poisson distribution with lambda = 0.0725") + 
  xlab("Number of occurrences per day") + 
  ylab("Relative frequency") +
  geom_bar(aes(x=x_DV,y=y_DV/(365*2+1)), stat = "identity") +
  geom_line(aes(x=x_DV,y=fx), color='red')

The Poisson distribution fits well with the histogram.

99% Confidence Interval of \(\lambda\) for DV after COVID-19 is [0.0465, 0.0985].

Comparing Confidence Intervals

Figure 17 shows the Confidence Intervals before and after COVID-19. There is overlap in the Confidence Intervals, and it is not possible to say that there was a change in the \(\lambda\) of “Offenses Against Family & Children” before or after COVID-19.

# Labels for the two confidence intervals.
x <- c('Before COVID-19','After COVID-19')
# 99% CI for lambda: xbar +/- 2.58 * sqrt(xbar / n), with xbar the mean daily
# DV count and n the number of days (1461 for 2016-2019, 731 for 2020-2021).
# Fixed: the standard-error denominators previously used 356 where every
# other calculation in this document uses 365 days per year.
pre_covid_interval <- c(99/(365*4+1) - 2.58*(99/(365*4+1)/(365*4+1))**0.5,
                        99/(365*4+1) + 2.58*(99/(365*4+1)/(365*4+1))**0.5)
post_covid_interval <- c(53/(365*2+1) - 2.58*(53/(365*2+1)/(365*2+1))**0.5,
                         53/(365*2+1) + 2.58*(53/(365*2+1)/(365*2+1))**0.5)
data_CI_DV <- data.frame(x,pre_covid_interval,post_covid_interval)

# Figure 17: horizontal range plot comparing the two intervals.
ggplot(data_CI_DV) +
  ggtitle("Figure 17: 99% Confidence Interval of lambda for DV") + 
  xlab("") +
  ylab("99% Confidence Interval of lambda") + 
  coord_flip() +
  geom_linerange(aes(x=x[1], ymin=pre_covid_interval[1], ymax=pre_covid_interval[2])) +
  geom_linerange(aes(x=x[2], ymin=post_covid_interval[1], ymax=post_covid_interval[2]))

Traffic Violations

Before COVID-19

The trend of “Traffic Violations” also appears to have changed after COVID-19. The frequency table of Traffic Violations before COVID-19 is as follows.

# Traffic-violation arrests in the pre-COVID subset (2016-2019).
DF_WM_16_19_TV <- DF_WM_16_19[DF_WM_16_19$Arrest.Category=='Traffic Violations', ]

# Number of traffic-violation arrests on each distinct arrest date; the
# total feeds the lambda estimates below via sum(TV_day_16_19).
TV_day_16_19 <- vapply(unique(DF_WM_16_19_TV$Arrest.Date),
                       function(d) sum(DF_WM_16_19_TV$Arrest.Date == d),
                       integer(1))
# of occurrences per day Frequency Relative frequency
0 602 0.4123288
1 530 0.3627652
2 225 0.1540041
3 77 0.0527036
4 22 0.0150582
5 4 0.0027379
6 1 6.844627^{-4}
7 0 0

The calculated \(\lambda = 0.907\). The histogram and the poisson distribution with \(\lambda = 0.907\) are shown in Figure 19.

# Support values 0-10 occurrences/day and observed day counts for 2016-2019
# (365*4+1 = 1461 days; 2016 is a leap year; 859 dates had >= 1 TV arrest).
x_TV <- 0:10
y_TV <- c(365*4+1-859,530,225,77,22,4,1,0,0,0,0)
# Poisson pmf with lambda = total TV arrests / 1461 days ~ 0.907.
fx <- dpois(x=x_TV, lambda=sum(TV_day_16_19)/(365*4+1))
data_TV <- data.frame(x_TV, y_TV, fx)

# Figure 18: raw frequency histogram of daily traffic-violation counts.
ggplot(data_TV, aes(x=x_TV,y=y_TV)) +
  ggtitle("Figure 18: Histogram of traffic violations in 2016 - 2019") + 
  xlab("Number of occurrences per day") + 
  ylab("Frequency") +
  geom_bar(stat = "identity") 

# Figure 19: relative-frequency histogram with the fitted Poisson pmf overlaid.
# Fixed typo in the title: "Pission" -> "Poisson".
ggplot(data_TV) +
  ggtitle("Figure 19: Relative frequency histogram of traffic violations in 2016 - 2019 \n and Poisson distribution with lambda = 0.907") + 
  xlab("Number of occurrences per day") + 
  ylab("Relative frequency") +
  geom_bar(aes(x=x_TV,y=y_TV/(365*4+1)), stat = "identity") +
  geom_line(aes(x=x_TV,y=fx), color="red")

The Poisson distribution fits well with the histogram.

99% Confidence Interval of \(\lambda\) for Traffic Violations before COVID-19 is [0.842, 0.972].

After COVID-19

The frequency table of Traffic Violations after COVID-19 is as follows.

# Traffic-violation arrests in the post-COVID subset (2020-2021).
DF_WM_20_21_TV <- DF_WM_20_21[DF_WM_20_21$Arrest.Category=='Traffic Violations', ]

# Number of traffic-violation arrests on each distinct arrest date; the
# total feeds the lambda estimates below via sum(TV_day_20_21).
TV_day_20_21 <- vapply(unique(DF_WM_20_21_TV$Arrest.Date),
                       function(d) sum(DF_WM_20_21_TV$Arrest.Date == d),
                       integer(1))
# of occurrences per day Frequency Relative frequency
0 545 0.745554
1 157 0.2147743
2 23 0.0314637
3 3 0.004104
4 3 0.004104
5 0 0

The calculated \(\lambda = 0.306\). The histogram and the poisson distribution with \(\lambda = 0.306\) are shown in Figure 21.

# Support values 0-10 occurrences/day and observed day counts for 2020-2021
# (365*2+1 = 731 days; 2020 is a leap year; 186 dates had >= 1 TV arrest).
x_TV <- 0:10
y_TV <- c(365*2+1-186,157,23,3,3,0,0,0,0,0,0)
# Poisson pmf with lambda = total TV arrests / 731 days ~ 0.306.
fx <- dpois(x=x_TV, lambda=sum(TV_day_20_21)/(365*2+1))
data_TV <- data.frame(x_TV, y_TV, fx)

# Figure 20: raw frequency histogram of daily traffic-violation counts.
ggplot(data_TV, aes(x=x_TV,y=y_TV)) +
  ggtitle("Figure 20: Histogram of traffic violations in 2020 - 2021") + 
  xlab("Number of occurrences per day") + 
  ylab("Frequency") +
  geom_bar(stat = "identity") 

# Figure 21: relative-frequency histogram with the fitted Poisson pmf overlaid.
# Fixed typo in the title: "Pission" -> "Poisson".
ggplot(data_TV) +
  ggtitle("Figure 21: Relative frequency histogram of traffic violations in 2020 - 2021 and Poisson distribution with lambda = 0.306") + 
  xlab("Number of occurrences per day") + 
  ylab("Relative frequency") +
  geom_bar(aes(x=x_TV,y=y_TV/(365*2+1)), stat = "identity") +
  geom_line(aes(x=x_TV,y=fx), color="red")

The Poisson distribution fits well with the histogram.

99% Confidence Interval of \(\lambda\) for Traffic Violations after COVID-19 is [0.253, 0.36].

Comparing Confidence Intervals

Figure 22 shows the Confidence Intervals before and after COVID-19. There is no overlap in the Confidence Intervals, and there may have been a change in the Traffic Violations’ lambda before and after COVID-19.

# Labels for the two confidence intervals.
x <- c('Before COVID-19','After COVID-19')
# 99% CI for lambda: xbar +/- 2.58 * sqrt(xbar / n), with xbar the mean daily
# TV count and n the number of days (1461 for 2016-2019, 731 for 2020-2021).
# Fixed: the standard-error denominators previously used 356 where every
# other calculation in this document uses 365 days per year.
pre_covid_interval <- c(sum(TV_day_16_19)/(365*4+1) - 2.58*(sum(TV_day_16_19)/(365*4+1)/(365*4+1))**0.5,
                        sum(TV_day_16_19)/(365*4+1) + 2.58*(sum(TV_day_16_19)/(365*4+1)/(365*4+1))**0.5)
post_covid_interval <- c(sum(TV_day_20_21)/(365*2+1) - 2.58*(sum(TV_day_20_21)/(365*2+1)/(365*2+1))**0.5,
                         sum(TV_day_20_21)/(365*2+1) + 2.58*(sum(TV_day_20_21)/(365*2+1)/(365*2+1))**0.5)
data_CI_TV <- data.frame(x,pre_covid_interval,post_covid_interval)

# Figure 22: horizontal range plot comparing the two intervals.
ggplot(data_CI_TV) +
  ggtitle("Figure 22: 99% Confidence Interval of lambda for Traffic Violations") + 
  ylab("99% Confidence Interval of lambda") + 
  xlab("") +
  coord_flip() +
  geom_linerange(aes(x=x[1], ymin=pre_covid_interval[1], ymax=pre_covid_interval[2])) +
  geom_linerange(aes(x=x[2], ymin=post_covid_interval[1], ymax=post_covid_interval[2]))